• Summary
I - Libraries
II - Load Data
A - Deal wit NaN
B - Deal with dates
C - Deal with coordinates
III - First Visualization
IV - Machine Learning on Geo Spatial Data
A - KMeans for spatial visualization
B - Work with specific crimes
V - DBSCAN - London drugs possession warning
A - Learning part
B - Visualization part
from sklearn.metrics import pairwise_distances
from __future__ import unicode_literals
from sklearn.cluster import KMeans, DBSCAN
from scipy.spatial import ConvexHull
from scipy.spatial import Voronoi
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import metrics
from PIL import Image
import urllib.request
import pandas as pd
import numpy as np
import pytz as tz
import urllib
import wget
import sys
import os
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#https://www.kaggle.com/sohier/london-police-records/downloads/london-outcomes.csv
df = pd.read_csv("london_crimes.csv")
df.head()
print("Shape with NA:", df.shape)
df = df.dropna()
print("Shape without NA:",df.shape)
B - Deal with dates
df["Timestamp"] = pd.to_datetime(df['Month'], format='%Y-%m', errors='coerce')
df.head()
start = datetime(2015, 1, 1)
end = datetime(2017, 12, 31)
df_cleaned = df[(df.Timestamp > start) & (df.Timestamp < end)].copy()
df_cleaned.shape
df_cleaned = df_cleaned[['Outcome type', 'Latitude', 'Longitude', 'Timestamp']].copy()
df_cleaned.columns = ['Outcome type', 'Latitude', 'Longitude', 'Timestamp']
df_cleaned.head()
C - Deal with coordinates
df_cleaned = df_cleaned[(df_cleaned.Longitude > -0.2)
& (df_cleaned.Longitude < -0.02)
& (df_cleaned.Latitude > 51.44)
& (df_cleaned.Latitude < 51.58)]
df_cleaned.shape
def get_map(x, y, z, size, filename) :
# Load map on staticmap.openstreetmap.de depending on coordinates
static_map = "http://staticmap.openstreetmap.de/staticmap.php?center={0},{1}&zoom={2}&size={3}x{3}&maptype=mapnik".format(y,x,z,size)
static_map_filename, headers = urllib.request.urlretrieve(static_map, filename)
return static_map_filename
def geomap(data, center_coordinates, zoom=13, point_size=3, point_color='r', point_alpha=1):
z, picsize = zoom, 1000
wx = 1.0*360*(picsize/256)/(2**z)
wy = 0.76*360*(picsize/256)/(2**z)
lat_center = center_coordinates[0]
lon_center = center_coordinates[1]
x_min, x_max = lon_center - wx/2, lon_center + wx/2
y_min, y_max = lat_center - wy/2, lat_center + wy/2
static_map_filename = 'london_staticmap_{}_{}.png'.format(z, picsize)
if os.path.isfile(static_map_filename) == False:
get_map(x, y, z, picsize, static_map_filename)
img = Image.open(static_map_filename)
#add the static map
plt.imshow(img, zorder=0, extent=[x_min, x_max, y_min, y_max], interpolation='none', aspect='auto')
#add the scatter plot of events
plt.plot(data['Longitude'], data['Latitude'], '.',
markerfacecolor=point_color, markeredgecolor='k',
markersize=point_size, alpha=point_alpha)
#limit the plot to the given box
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
##### END FUNCTION #####
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(20,20)
london_center = (51.510579, -0.109852)
geomap(df_cleaned[['Longitude','Latitude']], center_coordinates=london_center)
plt.show()
ml = KMeans(n_clusters=100, init='k-means++')
ml.fit(df_cleaned[['Longitude', 'Latitude']].sample(frac=0.3))
cluster = ml.cluster_centers_
cluster[:10]
df_cleaned['cluster'] = ml.predict(df_cleaned[['Longitude','Latitude']])
df_cleaned[['Outcome type','Latitude','Longitude', 'cluster']].head()
def voronoi_polygons_2d(vor, radius=None):
"""
Reconstruct infinite voronoi regions in a 2D diagram to finite
regions.
Input_args:
vor : Voronoi
Input diagram
radius : float, optional
Distance to 'points at infinity'.
:returns:
regions : list of tuples
Indices of vertices in each revised Voronoi regions.
vertices : list of tuples
Coordinates for revised Voronoi vertices. Same as coordinates
of input vertices, with 'points at infinity' appended to the
end.
"""
if vor.points.shape[1] != 2:
raise ValueError("Requires 2D input")
new_regions = []
new_vertices = vor.vertices.tolist()
center = vor.points.mean(axis=0)
if radius is None:
radius = vor.points.ptp().max()*2
# Construct a map containing all ridges for a given point
all_ridges = {}
for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
all_ridges.setdefault(p1, []).append((p2, v1, v2))
all_ridges.setdefault(p2, []).append((p1, v1, v2))
# Reconstruct infinite regions
for p1, region in enumerate(vor.point_region):
vertices = vor.regions[region]
if all([v >= 0 for v in vertices]):
# finite region
new_regions.append(vertices)
continue
# reconstruct a non-finite region
ridges = all_ridges[p1]
new_region = [v for v in vertices if v >= 0]
for p2, v1, v2 in ridges:
if v2 < 0:
v1, v2 = v2, v1
if v1 >= 0:
# finite ridge: already in the region
continue
# Compute the missing endpoint of an infinite ridge
t = vor.points[p2] - vor.points[p1] # tangent
t /= np.linalg.norm(t)
n = np.array([-t[1], t[0]]) # normal
midpoint = vor.points[[p1, p2]].mean(axis=0)
direction = np.sign(np.dot(midpoint - center, n)) * n
far_point = vor.vertices[v2] + direction * radius
new_region.append(len(new_vertices))
new_vertices.append(far_point.tolist())
# sort region counterclockwise
vs = np.asarray([new_vertices[v] for v in new_region])
c = vs.mean(axis=0)
angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0])
new_region = np.array(new_region)[np.argsort(angles)]
# finish
new_regions.append(new_region.tolist())
return new_regions, np.asarray(new_vertices)
points = cluster
# compute Voronoi tesselation
vor = Voronoi(points)
regions, vertices = voronoi_polygons_2d(vor)
## PLOT
# prepare figure
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(20,20)
#geomap
geomap(df_cleaned[['Longitude','Latitude']], london_center, 13, 2,'k', 0.1)
# centroids
plt.plot(points[:,0], points[:,1], 'wo',markersize=10)
# colorize
for region in regions:
polygon = vertices[region]
plt.fill(*zip(*polygon), alpha=0.4)
plt.show()
B - Work with specific crimes
df_cleaned['Outcome type'].value_counts()
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(19, 6)
crime_1 = 'Suspect charged'
plt.subplot(1, 3, 1)
geomap(df_cleaned[df_cleaned['Outcome type'] == crime_1][['Longitude', 'Latitude']], london_center)
plt.title(str(crime_1))
crime_2 = 'Offender given a drugs possession warning'
plt.subplot(1, 3, 2)
geomap(df_cleaned[df_cleaned['Outcome type'] == crime_2][['Longitude', 'Latitude']], london_center)
plt.title(str(crime_2))
crime_3 = 'Offender given suspended prison sentence'
plt.subplot(1, 3, 3)
geomap(df_cleaned[df_cleaned['Outcome type'] == crime_3][['Longitude', 'Latitude']], london_center)
plt.title(str(crime_3))
plt.show()
crime = 'Offender given a drugs possession warning'
# eps - The maximum distance between two samples for them to be considered as in the same neighborhood.
# At least 10 points in a cluster
data = df_cleaned[df_cleaned['Outcome type'] == crime][['Longitude','Latitude']]
db = DBSCAN(eps=0.0023, min_samples=10, n_jobs=-1).fit(np.array(data[['Longitude','Latitude']]))
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
data['dbscan_cluster'] = db.labels_
data['dbscan_core'] = core_samples_mask
print("Number of clusters: {}".format(len(set(data['dbscan_cluster']))))
data.head()
B - Visualization part
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(20, 20)
empty = pd.DataFrame(columns=['Longitude', 'Latitude'])
geomap(empty, london_center, 13, 2, 'k', 0.1)
unique_labels = sorted(set(data['dbscan_cluster']))
for k in unique_labels:
xy = data[data['dbscan_cluster'] == k]
plt.plot(xy['Longitude'], xy['Latitude'], 'kD' if k < 0 else 'o', markersize=7)
plt.show()
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(20,20)
empty= pd.DataFrame(columns=['Longitude','Latitude'])
geomap(empty, london_center, 13, 2,'k',0.1)
# convex hulls for evry cluster
for k in unique_labels:
if k>=0:
xy = data[data['dbscan_cluster'] == k][['Longitude','Latitude']].reset_index(drop=True)
try:
hull = ConvexHull(xy.as_matrix())
for simplex in hull.simplices:
plt.plot(xy.iloc[simplex]['Longitude'], xy.iloc[simplex]['Latitude'], 'r-', lw=5)
except:
pass
plt.show()